Нужны эти пакеты
Откроем библиотеки
library("devtools")
library(gdsfmt)
library(SNPRelate)
library(calibrate)
library(vcfR)
У нас было 22 файла после SNP-коллинга, мы их свернули в один, предварительно удалив шапку и заменив название колонки sample1 на соответствующее название штамма.
индексируем
Запихиваем проиндексированные vcf в один файл
Посмотрим что получилось
# the VCF file
vcf.fn <- "/home/daria/Daria/BI/Klebs/vcf/COMBINED.vcf"
snpgdsVCF2GDS(vcf.fn, "CCOMBO.gds")
## VCF Format ==> SNP GDS Format
## Method: exacting biallelic SNPs
## Number of samples: 22
## Parsing "/home/daria/Daria/BI/Klebs/vcf/COMBINED.vcf" ...
## import 59585 variants.
## + genotype { Bit2 22x59585, 320.0K } *
## Optimize the access efficiency ...
## Clean up the fragments of GDS file:
## open the file 'CCOMBO.gds' (445.2K)
## # of fragments: 49
## save to 'CCOMBO.gds.tmp'
## rename 'CCOMBO.gds.tmp' (444.8K, reduced: 348B)
## # of fragments: 20
# open an example dataset (HapMap)
genofile <- snpgdsOpen("/home/daria/Daria/BI/Klebs/CCOMBO.gds") ## open the gate
genofile
## File: /home/daria/Daria/BI/Klebs/CCOMBO.gds (444.8K)
## + [ ] *
## |--+ sample.id { Str8 22 LZMA_ra(45.0%), 157B }
## |--+ snp.id { Int32 59585 LZMA_ra(6.95%), 16.2K }
## |--+ snp.rs.id { Str8 59585 LZMA_ra(0.26%), 161B }
## |--+ snp.position { Int32 59585 LZMA_ra(33.7%), 78.4K }
## |--+ snp.chromosome { Str8 59585 LZMA_ra(0.04%), 309B }
## |--+ snp.allele { Str8 59585 LZMA_ra(11.7%), 27.3K }
## |--+ genotype { Bit2 22x59585, 320.0K } *
## \--+ snp.annot [ ]
## |--+ qual { Float32 59585 LZMA_ra(0.08%), 193B }
## \--+ filter { Str8 59585 LZMA_ra(0.07%), 205B }
class(genofile)
## [1] "SNPGDSFileClass" "gds.class"
# "SNPGDSFileClass" "gds.class"
pca <- snpgdsPCA(genofile, sample.id = NULL, snp.id = NULL, autosome.only = FALSE,
remove.monosnp = FALSE, maf = NaN, missing.rate = NaN, eigen.cnt = 32,
num.thread = 6, bayesian = FALSE, need.genmat = FALSE,
genmat.only = FALSE, verbose = TRUE)
## Principal Component Analysis (PCA) on genotypes:
## Working space: 22 samples, 59,585 SNPs
## using 6 (CPU) cores
## PCA: the sum of all selected genotypes (0,1,2) = 0
## CPU capabilities: Double-Precision SSE2
## Wed Nov 14 03:57:17 2018 (internal increment: 34068)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Wed Nov 14 03:57:17 2018 Begin (eigenvalues and eigenvectors)
## Wed Nov 14 03:57:17 2018 Done.
tab <- data.frame(sample.id = pca$sample.id, EV1 = pca$eigenvect[,1], EV2 = pca$eigenvect[,2],stringsAsFactors = FALSE)
head(tab) ## слишком мелкие значения
## sample.id EV1 EV2
## 1 SRR6480136_095 7.117494e-67 1.907609e-28
## 2 SRR6480137_095 5.888024e-38 1.907609e-28
## 3 SRR6480138_095 1.863011e-264 2.168969e-28
## 4 SRR6480139_095 4.203704e+06 9.716666e-67
## 5 SRR6480140_095 1.087300e-66 1.482646e-71
## 6 SRR6480141_095 9.251745e+39 2.420155e-52
library(plotly)
plot_ly(tab, x = ~log2(EV1), y = ~log2(EV2), text = ~sample.id, title = "log2" )
plot_ly(tab, x = ~log10(EV1), y = ~log10(EV2), text = ~sample.id, title = "log10" )
Распределение SNP по позициям
CORR <- snpgdsPCACorr(pca, genofile, eig.which=1:4)
## SNP Correlation:
## Working space: 22 samples, 59585 SNPs
## using 1 (CPU) core
## using the top 4 eigenvectors
## Correlation: the sum of all selected genotypes (0,1,2) = 0
## Wed Nov 14 03:57:19 2018 (internal increment: 65536)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 0s
## Wed Nov 14 03:57:19 2018 Done.
#plot(abs(CORR$snpcorr[3,]), y = NULL, xlab="SNP Index", ylab="PC 3")
#не поддерживается почему-то при knit
# close the gate
snpgdsClose(genofile)